[Deep Learning] Word Embedding (Korean)

FastText wiki pre-trained model (text vectors, wiki.ko.vec)

In [1]:
from __future__ import print_function
from gensim.models import KeyedVectors
In [2]:
# Load the pre-trained wiki word vectors
ko_model = KeyedVectors.load_word2vec_format('wiki.ko.vec')
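
The wiki.ko.vec file is, presumably, the Korean entry from fastText's pre-trained Wikipedia word vectors (https://fasttext.cc/docs/en/pretrained-vectors.html); loading the plain-text file takes a while and several GB of RAM. A minimal sanity check on the loaded model:

# Sanity check: vocabulary size and vector dimensionality
print(len(ko_model.vocab))    # number of word types in the model
print(ko_model.vector_size)   # 300 for the fastText wiki vectors
print(ko_model['사랑'][:5])    # first five components of one vector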
In [3]:
# Save the loaded vectors in gensim's native format for faster reloading
ko_model.save('ko_model')
In [4]:
# Precompute L2-normalized vectors in place; replace=True discards the
# originals to save memory (the model can no longer be trained afterwards)
ko_model.init_sims(replace=True)
In [5]:
# Collect the model vocabulary into a list
words = list(ko_model.vocab)
In [6]:
find_similar_to = '사랑'
In [7]:
# Print the 10 words most similar to '사랑' ("love")
for similar_word in ko_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))
Word: 사랑사랑, Similarity: 0.81
Word: 사랑치, Similarity: 0.78
Word: 사랑일, Similarity: 0.77
Word: 사랑느낌, Similarity: 0.76
Word: 사랑이었네, Similarity: 0.76
Word: 사랑이여, Similarity: 0.75
Word: 사랑병, Similarity: 0.75
Word: 사랑인, Similarity: 0.75
Word: 사랑맛, Similarity: 0.75
Word: 사랑노래, Similarity: 0.74
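
These scores are plain cosine similarities; after init_sims(replace=True) the stored vectors are unit-length, so a dot product reproduces them. A minimal check with numpy:

import numpy as np

# Reproduce the first score above by hand
v1, v2 = ko_model['사랑'], ko_model['사랑사랑']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(round(float(cos), 2))  # should print 0.81, matching the output above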
In [8]:
word_add = ['동물', '파충류']
word_sub = ['뱀']
In [9]:
# Vector arithmetic: '동물' (animal) + '파충류' (reptile) - '뱀' (snake)
for resultant_word in ko_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))
Word : 포유류 , Similarity: 0.72
Word : 포유동물 , Similarity: 0.71
Word : 절지동물 , Similarity: 0.69
Word : 양서류 , Similarity: 0.69
Word : 독동물 , Similarity: 0.69
Word : 포유류분류 , Similarity: 0.68
Word : 무척추동물 , Similarity: 0.68
Word : 척추동물분류 , Similarity: 0.68
Word : 도시동물 , Similarity: 0.68
Word : 동물상 , Similarity: 0.67
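
Under the hood, most_similar implements the 3CosAdd analogy method of Mikolov et al.: positive vectors are combined with weight +1, negative ones with weight -1, the result is normalized, and the whole vocabulary is ranked by cosine similarity (with the query words excluded). A rough re-implementation for illustration, assuming init_sims(replace=True) has been called so all stored vectors are unit-length:

import numpy as np

# Query direction: 동물 + 파충류 - 뱀
query = ko_model['동물'] + ko_model['파충류'] - ko_model['뱀']
query /= np.linalg.norm(query)
sims = ko_model.vectors @ query        # dot product == cosine for unit vectors
top = np.argsort(-sims)[:13]           # a few extra slots to skip the query words
print([ko_model.index2word[i] for i in top])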
In [10]:
similarities = ko_model.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [11]:
print(similarities)
[('포유류', 0.7234190702438354), ('포유동물', 0.7082793712615967), ('절지동물', 0.6905428171157837), ('양서류', 0.6887608766555786), ('독동물', 0.6857677698135376), ('포유류분류', 0.6800143718719482), ('무척추동물', 0.6791884899139404), ('척추동물분류', 0.6789263486862183), ('도시동물', 0.6775411367416382), ('동물상', 0.6730656623840332)]
In [12]:
# Find the word that does not belong with the others
not_matching = ko_model.doesnt_match("아침 점심 저녁 된장국".split())
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
In [13]:
print(not_matching)
된장국
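
doesnt_match averages the (normalized) vectors of the given words and returns the word farthest from that mean, so '된장국' (soybean-paste soup) is singled out from the three times of day. The same result by hand:

import numpy as np

words_in = "아침 점심 저녁 된장국".split()  # morning, lunch, evening, soybean-paste soup
vecs = np.vstack([ko_model[w] for w in words_in])  # unit vectors after init_sims
mean = vecs.mean(axis=0)
mean /= np.linalg.norm(mean)
print(words_in[int(np.argmin(vecs @ mean))])  # -> 된장국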
In [14]:
# Cosine similarity between '컴퓨터' (computer) and '인간' (human)
sim_score = ko_model.similarity('컴퓨터', '인간')
In [15]:
print(sim_score)
0.4248201
In [16]:
sim_score = ko_model.similarity('로봇', '인간')
In [17]:
print(sim_score)
0.4782262
In [18]:
sim_score = ko_model.similarity('사랑해', '사랑의')
In [19]:
print(sim_score)
0.5480147
In [20]:
# 10 most similar words to '전자' ("electron"/"electronics")
print(ko_model.most_similar('전자'))
[('전자빔', 0.7568391561508179), ('전자렌지', 0.7566049098968506), ('전자기기', 0.7503113746643066), ('전자양', 0.7480576634407043), ('가전자', 0.7460261583328247), ('전자악기', 0.7431635856628418), ('전자기기와', 0.7412865161895752), ('전자기계', 0.7339802980422974), ('전자만', 0.7331153154373169), ('전자적인', 0.7289555072784424)]
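
Note that the .vec file stores only whole-word vectors: the subword n-grams are not included, so an out-of-vocabulary lookup raises a KeyError. A quick check (the OOV token below is a made-up example and may behave differently if it happens to be in the vocabulary):

# Hypothetical OOV token, for illustration only
try:
    ko_model['사랑스러움들']
except KeyError:
    print('OOV: the .vec file has no subword information')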

FastText wiki pre-trained model (full binary model, wiki.ko.bin)

In [21]:
from gensim.models import FastText
In [22]:
ft2 = FastText.load_fasttext_format('wiki.ko.bin')
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `load_fasttext_format` (use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead).
  """Entry point for launching an IPython kernel.
In [23]:
# Save the full fastText model under the name 'ft2'
ft2.save('ft2')
In [24]:
ft2.init_sims(replace=True)
In [25]:
find_similar_to = '사랑'
In [26]:
for similar_word in ft2.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))
Word: 사랑사랑, Similarity: 0.81
Word: 사랑치, Similarity: 0.78
Word: 사랑일, Similarity: 0.77
Word: 사랑느낌, Similarity: 0.76
Word: 사랑이었네, Similarity: 0.76
Word: 사랑이여, Similarity: 0.75
Word: 사랑병, Similarity: 0.75
Word: 사랑인, Similarity: 0.75
Word: 사랑맛, Similarity: 0.75
Word: 사랑노래, Similarity: 0.74
In [27]:
word_add = ['동물', '파충류']
word_sub = ['뱀']
In [28]:
for resultant_word in ft2.wv.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))
Word : 포유류 , Similarity: 0.72
Word : 포유동물 , Similarity: 0.71
Word : 절지동물 , Similarity: 0.69
Word : 양서류 , Similarity: 0.69
Word : 독동물 , Similarity: 0.69
Word : 포유류분류 , Similarity: 0.68
Word : 무척추동물 , Similarity: 0.68
Word : 척추동물분류 , Similarity: 0.68
Word : 도시동물 , Similarity: 0.68
Word : 동물상 , Similarity: 0.67
In [29]:
similarities = ft2.wv.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [30]:
print(similarities)
[('포유류', 0.7234194874763489), ('포유동물', 0.7082797884941101), ('절지동물', 0.6905421018600464), ('양서류', 0.6887608766555786), ('독동물', 0.6857682466506958), ('포유류분류', 0.6800158023834229), ('무척추동물', 0.6791882514953613), ('척추동물분류', 0.6789271831512451), ('도시동물', 0.6775408983230591), ('동물상', 0.6730659008026123)]
In [31]:
not_matching = ft2.wv.doesnt_match("아침 점심 저녁 된장국".split())
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
In [32]:
print(not_matching)
된장국
In [33]:
sim_score = ft2.wv.similarity('컴퓨터', '인간')
In [34]:
print(sim_score)
0.42482015
In [35]:
sim_score = ft2.wv.similarity('로봇', '인간')
In [36]:
print(sim_score)
0.47822645
In [37]:
sim_score = ft2.wv.similarity('사랑해', '사랑의')
In [38]:
print(sim_score)
0.54801536
In [39]:
print(ft2.wv.most_similar('전자'))
[('전자빔', 0.7568387985229492), ('전자렌지', 0.7566049695014954), ('전자기기', 0.750311017036438), ('전자양', 0.7480573654174805), ('가전자', 0.7460258603096008), ('전자악기', 0.743162989616394), ('전자기기와', 0.7412854433059692), ('전자기계', 0.7339791059494019), ('전자만', 0.7331157922744751), ('전자적인', 0.7289554476737976)]
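
The advantage of the full .bin model is that it keeps the character n-gram vectors, so it can compose an embedding even for words never seen in training (same made-up token as before, for illustration):

# OOV lookup works here, unlike with the .vec KeyedVectors
print('사랑스러움들' in ft2.wv.vocab)  # likely False
print(ft2.wv['사랑스러움들'][:5])      # a vector is still composed from n-grams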

Word2Vec wiki pre-trained model

In [40]:
import gensim
In [41]:
# Load a pre-trained Korean Word2Vec model
ko_w2v = gensim.models.Word2Vec.load('ko.bin')
In [42]:
ko_w2v.init_sims(replace=True)
In [43]:
similarities_wv = ko_w2v.wv.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [44]:
print(similarities_wv)
[('생물', 0.6952868700027466), ('영장류', 0.6766470670700073), ('조류', 0.6660945415496826), ('양서류', 0.6637342572212219), ('포유류', 0.659113347530365), ('설치류', 0.636635422706604), ('무척추', 0.6241835355758667), ('어류', 0.6236225366592407), ('절지', 0.6208628416061401), ('곤충', 0.6167744398117065)]
In [45]:
sim_score_wv = ko_w2v.wv.similarity('컴퓨터', '인간')
In [46]:
print(sim_score_wv)
0.21644185
In [47]:
sim_score_wv = ko_w2v.wv.similarity('로봇', '인간')
In [48]:
print(sim_score_wv)
0.40642476
In [49]:
print(ko_w2v.wv.most_similar(positive=["전자"], topn=10))
[('반도체', 0.6502741575241089), ('양전자', 0.6052197217941284), ('복사기', 0.5808517336845398), ('음전하', 0.5768587589263916), ('원자가', 0.5756815671920776), ('음극', 0.5747135281562805), ('양전하', 0.5658353567123413), ('절연체', 0.5621837377548218), ('상거래', 0.5594459772109985), ('광자', 0.5468275547027588)]
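
Plain Word2Vec also stores only whole-word vectors, which is why its neighbors are all dictionary words (compare fastText's subword-flavored neighbors like '전자기기와' above), and why OOV lookups fail:

# Same made-up OOV token as in the fastText sections
try:
    ko_w2v.wv['사랑스러움들']
except KeyError:
    print('OOV: Word2Vec has no subword fallback')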

GloVe model (trained here on a news-article corpus, not pre-trained)

In [50]:
from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix
In [51]:
# Korean news articles from 2016-10-20; iter_sent=True yields one
# double-space-delimited sentence at a time
corpus_path = '2016-10-20_article_all_normed.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
In [52]:
# Build a (word, context) co-occurrence matrix from the corpus
x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,                       # context window size
    min_tf=10,                       # drop words occurring fewer than 10 times
    tokenizer=lambda s: s.split(),   # default: whitespace tokenizer
    dynamic_weight=True,             # weight contexts by distance to the center word
    verbose=True)
print(x.shape)
Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=3.380 Gb
  - scanning (word, context) pairs from 223356 sents, mem=3.899 Gb
  - (word, context) matrix was constructed. shape = (50091, 50091)                    
  - done
(50091, 50091)
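
x is a scipy sparse matrix whose rows and columns are both indexed by idx2vocab. One way to peek at the raw (distance-weighted, given dynamic_weight=True) co-occurrence values for a single word, a sketch assuming '동물' survived the min_tf=10 cutoff:

import numpy as np

vocab2idx = {w: i for i, w in enumerate(idx2vocab)}
row = x[vocab2idx['동물']].toarray().ravel()  # context weights for '동물'
top = np.argsort(-row)[:10]
print([(idx2vocab[i], row[i]) for i in top])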
In [53]:
from glove import Glove
In [54]:
# Create the GloVe model and train it on the co-occurrence matrix
glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)
Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
In [55]:
# Build the word -> index dictionary so the model can be queried by word
dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)
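
With the dictionary attached, raw GloVe vectors can also be pulled out directly:

vec = glove.word_vectors[glove.dictionary['사랑']]
print(vec.shape)  # (100,), i.e. no_components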
In [56]:
print(glove.most_similar('사랑', number=10))
[('길방', 0.9523622311165693), ('이문세의', 0.7709299671167765), ('일과', 0.7067012792878986), ('백선생2', 0.6770944696844872), ('월드지수', 0.6588736838372226), ('취미는', 0.6578404706277452), ('삼성화재가', 0.64967057830683), ('마스터', 0.6404152929079104), ('뽀꼬', 0.6362728223085667)]
In [57]:
print(glove.most_similar('동물', number=10))
[('애호', 0.8991982603293193), ('뉴욕에서', 0.8788155717188214), ('대사관', 0.8728831818746616), ('캐나다', 0.8563384980254631), ('아마존', 0.8560863828784364), ('태국', 0.8543265546091593), ('전역에서', 0.844629117471427), ('텍사스', 0.8327472727116574), ('등에서도', 0.814652839733725)]
In [58]:
print(glove.most_similar('로봇', number=10))
[('전문기자', 0.8650349594862273), ('기사입니다', 0.8502601533196349), ('풍선효과', 0.8335515748947783), ('명물로', 0.8163664744891702), ('증시분석', 0.807058672084537), ('등장했다고', 0.8057628957506581), ('지정하는', 0.8020943361293313), ('해안가', 0.794010006944613), ('씽크풀의', 0.785495111290533)]
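
With only a single day of news text and 5 training epochs, these neighbor lists are understandably much noisier than the wiki-scale fastText and Word2Vec results above.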

kor2vec model (cells left commented out; training not run here)

In [59]:
#from kor2vec import Kor2Vec
In [60]:
#kor2vec = Kor2Vec(embed_size=128)
In [61]:
#kor2vec.train("2016-10-20_article_all_normed.txt")